1 Parkinson’s PCA-EFA-UPSTM

1.0.1 Loading the libraries

library("FRESA.CAD")
library(psych)
library(whitening)
library("vioplot")

library(readxl)
op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 The Data


pd_speech_features <- as.data.frame(read_excel("~/GitHub/FCA/Data/pd_speech_features.xlsx",sheet = "pd_speech_features", range = "A2:ACB758"))

1.1.1 The Average of the Three Repetitions

Each subject had three repeated observations. Here I’ll use the average of the three experiments per subject.

rep1Parkison <- subset(pd_speech_features,RID==1)
rownames(rep1Parkison) <- rep1Parkison$id
rep1Parkison$id <- NULL
rep1Parkison$RID <- NULL
rep1Parkison[,1:ncol(rep1Parkison)] <- sapply(rep1Parkison,as.numeric)

rep2Parkison <- subset(pd_speech_features,RID==2)
rownames(rep2Parkison) <- rep2Parkison$id
rep2Parkison$id <- NULL
rep2Parkison$RID <- NULL
rep2Parkison[,1:ncol(rep2Parkison)] <- sapply(rep2Parkison,as.numeric)

rep3Parkison <- subset(pd_speech_features,RID==3)
rownames(rep3Parkison) <- rep3Parkison$id
rep3Parkison$id <- NULL
rep3Parkison$RID <- NULL
rep3Parkison[,1:ncol(rep3Parkison)] <- sapply(rep3Parkison,as.numeric)

whof <- !(colnames(rep1Parkison) %in% c("gender","class"));
avgParkison <- rep1Parkison;
avgParkison[,whof] <- (rep1Parkison[,whof] + rep2Parkison[,whof] + rep3Parkison[,whof])/3

## I apply the log transform to the data
##avgParkison[,whof] <- signedlog(avgParkison[,whof])
#avgParkison[,whof] <- FRESAScale(avgParkison[,whof],method="OrderLogit")$scaledData
#pander::pander(table(avgParkison$class))

signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(avgParkison) %in% c("gender","class"));
avgParkison[,whof] <- signedlog(avgParkison[,whof])

1.1.1.1 Standarize the names for the reporting

dataframe <- avgParkison
outcome <- "class"

trainFraction <- 0.5
rhoThreshold <- 0.6
TopVariables <- 5
aucTHR <- 0.55

set.seed(5)
trainSample <- sample(nrow(dataframe),nrow(dataframe)*trainFraction)

trainDataFrame <- dataframe[trainSample,]
testDataFrame <- dataframe[-trainSample,]

1.1.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
252 753
pander::pander(table(dataframe[,outcome]))
0 1
64 188
pander::pander(table(trainDataFrame[,outcome]))
0 1
32 94
pander::pander(table(testDataFrame[,outcome]))
0 1
32 94

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

1.2 Univariate


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","IDI","ROCAUC","cStatCorr")
univar <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
              rankingTest = "CStat")

100 : std_MFCC_1st_coef 200 : app_entropy_shannon_10_coef 300 : app_LT_entropy_log_9_coef 400 : tqwt_entropy_log_dec_7 500 : tqwt_TKEO_std_dec_35
600 : tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19



#univar$orderframe[1:5,univariate_columns]
univarTest <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               trainDataFrame,
               testData=testDataFrame,
              rankingTest = "CStat")

100 : std_MFCC_1st_coef 200 : app_entropy_shannon_10_coef 300 : app_LT_entropy_log_9_coef 400 : tqwt_entropy_log_dec_7 500 : tqwt_TKEO_std_dec_35
600 : tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univar$orderframe$BACC <- (univar$orderframe$Sensitivity + univar$orderframe$Specificity)/2.0
univarTest$orderframe$BACC <- (univarTest$orderframe$Sensitivity + univarTest$orderframe$Specificity)/2.0

#pROC::roc(trainDataFrame$class,trainDataFrame[,univar$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.3 Decorrelation with UPSTM Blind

DEdataframe <- IDeA(trainDataFrame,thr=rhoThreshold)
predTestDe <- predictDecorrelate(DEdataframe,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframe);
pander::pander(head(ltvar))
  • La_PPE:

    PPE det_LT_TKEO_mean_10_coef
    1 0.0584
  • La_DFA:

    DFA IMF_SNR_SEO
    1 -0.0426
  • La_RPDE:

    RPDE rapJitter
    1 -0.226
  • La_numPulses:

    numPulses app_LT_entropy_shannon_8_coef
    1 2.12
  • La_numPeriodsPulses:

    numPulses numPeriodsPulses app_entropy_shannon_1_coef app_entropy_shannon_3_coef app_det_TKEO_mean_8_coef
    -1.01 1 -0.083 0.123 0.0423
  • La_meanPeriodPulses:

    numPulses numPeriodsPulses meanPeriodPulses
    11.9 -10.9 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
3.84
pander::pander(c(Latent=length(ltvar)))
Latent
547


varlistDe <-  colnames(DEdataframe)[colnames(DEdataframe) != outcome];
univarDe <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DEdataframe,
              testData=predTestDe,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDe$orderframe$BACC <- (univarDe$orderframe$Sensitivity + univarDe$orderframe$Specificity)/2.0
univarDeTest$orderframe$BACC <- (univarDeTest$orderframe$Sensitivity + univarDeTest$orderframe$Specificity)/2.0

#univarDe$orderframe[1:5,univariate_columns]
#univarDeTest$orderframe[1:5,univariate_columns]

#pROC::roc(DEdataframe$class,DEdataframe[,univarDe$orderframe$Name[1]],direction=">",plot=TRUE,auc=TRUE,quiet = TRUE)

1.4 Decorrelation with UPSTM Blind/Spearman

DEdataframeSpear <- IDeA(trainDataFrame,thr=rhoThreshold,method="spearman")
predTestDeSpear <- predictDecorrelate(DEdataframeSpear,testDataFrame)

ltvar <- getLatentCoefficients(DEdataframeSpear);
pander::pander(head(ltvar))
  • La_PPE:

    PPE Ed_3_coef
    1 0.0585
  • La_DFA:

    DFA IMF_SNR_SEO
    1 -0.0426
  • La_numPulses:

    numPulses app_LT_entropy_shannon_8_coef
    1 2.12
  • La_numPeriodsPulses:

    numPulses numPeriodsPulses app_entropy_shannon_1_coef app_LT_entropy_shannon_8_coef
    -1.01 1 -0.0215 0.101
  • La_meanPeriodPulses:

    numPulses meanPeriodPulses
    0.934 1
  • La_stdDevPeriodPulses:

    numPulses stdDevPeriodPulses app_LT_entropy_shannon_8_coef
    4.2 1 8.92
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
3.29
pander::pander(c(Latent=length(ltvar)))
Latent
552


varlistDeSpear <-  colnames(DEdataframeSpear)[colnames(DEdataframeSpear) != outcome];
univarDeSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DEdataframeSpear,
              testData=predTestDeSpear,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeSpear$orderframe$BACC <- (univarDeSpear$orderframe$Sensitivity + univarDeSpear$orderframe$Specificity)/2.0
univarDeSpearTest$orderframe$BACC <- (univarDeSpearTest$orderframe$Sensitivity + univarDeSpearTest$orderframe$Specificity)/2.0

1.5 Decorrelation with UPSTM Driven


DriDEdataframe <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold)
predTestDri <- predictDecorrelate(DriDEdataframe,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframe);
pander::pander(head(ltvar))
  • La_numPulses:

    numPulses VFER_entropy
    1 -0.183
  • La_numPeriodsPulses:

    numPulses numPeriodsPulses app_entropy_shannon_4_coef app_LT_entropy_shannon_8_coef
    -1.01 1 -0.0279 0.132
  • La_meanPeriodPulses:

    numPulses numPeriodsPulses meanPeriodPulses app_entropy_shannon_4_coef app_LT_entropy_shannon_8_coef
    15.9 -14.9 1 0.415 -1.96
  • La_locPctJitter:

    RPDE locPctJitter
    -1.97 1
  • La_locAbsJitter:

    locPctJitter locAbsJitter
    -1.25 1
  • La_rapJitter:

    locPctJitter rapJitter
    -1.29 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
3.94
pander::pander(c(Latent=length(ltvar)))
Latent
547


varlistDe <-  colnames(DriDEdataframe)[colnames(DriDEdataframe) != outcome];
univarDeDri <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeDriTest <- uniRankVar(varlistDe,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframe,
              testData=predTestDri,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeDri$orderframe$BACC <- (univarDeDri$orderframe$Sensitivity + univarDeDri$orderframe$Specificity)/2.0
univarDeDriTest$orderframe$BACC <- (univarDeDriTest$orderframe$Sensitivity + univarDeDriTest$orderframe$Specificity)/2.0

1.6 Decorrelation with UPSTM Driven and Spearman


DriDEdataframeSpear <- IDeA(trainDataFrame,Outcome=outcome,thr=rhoThreshold,method="spearman")
predTestDriSpear <- predictDecorrelate(DriDEdataframeSpear,testDataFrame)


ltvar <- getLatentCoefficients(DriDEdataframeSpear);
pander::pander(head(ltvar))
  • La_PPE:

    PPE Ed_3_coef
    1 0.0585
  • La_RPDE:

    RPDE Ed_3_coef
    1 -0.0561
  • La_numPulses:

    numPulses Ed_5_coef det_entropy_shannon_5_coef
    1 0.389 0.337
  • La_numPeriodsPulses:

    numPulses numPeriodsPulses app_entropy_shannon_1_coef app_LT_entropy_shannon_8_coef
    -1.01 1 -0.0215 0.101
  • La_meanPeriodPulses:

    numPulses meanPeriodPulses
    0.934 1
  • La_locPctJitter:

    RPDE locPctJitter
    -1.97 1
pander::pander(c(Avlen=mean(sapply(ltvar,length))))
Avlen
3.38
pander::pander(c(Latent=length(ltvar)))
Latent
542


varlistDeSpear <-  colnames(DriDEdataframeSpear)[colnames(DriDEdataframeSpear) != outcome];
univarDeDriSpear <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeDriSpearTest <- uniRankVar(varlistDeSpear,
              paste(outcome,"~1"),
                outcome,
              DriDEdataframeSpear,
              testData=predTestDriSpear,
              rankingTest = "CStat")

100 : La_std_MFCC_1st_coef 200 : La_app_entropy_shannon_10_coef 300 : La_app_LT_entropy_log_9_coef 400 : La_tqwt_entropy_log_dec_7 500 : La_tqwt_TKEO_std_dec_35
600 : La_tqwt_stdValue_dec_27 700 : tqwt_skewnessValue_dec_19


univarDeDriSpear$orderframe$BACC <- (univarDeDriSpear$orderframe$Sensitivity + univarDeDriSpear$orderframe$Specificity)/2.0
univarDeDriSpearTest$orderframe$BACC <- (univarDeDriSpearTest$orderframe$Sensitivity + univarDeDriSpearTest$orderframe$Specificity)/2.0

1.6.1 Get continous correlated features

iscontinous <- sapply(apply(trainDataFrame,2,unique),length) > 5 ## Only variables with enough samples

noclassData <- trainDataFrame[,iscontinous]
cmat <- cor(noclassData);
diag(cmat) <- 0;
maxcor <- apply(cmat>rhoThreshold,2,sum);
topcor <- names(maxcor[maxcor > 0]) ## Only correlated features will be PCA
pander::pander(c(Ncor=length(topcor)))
Ncor
584
cmat <- NULL

notcorr <- colnames(trainDataFrame)[!(colnames(trainDataFrame) %in% topcor)]
noclassData <- noclassData[,topcor]
noclassTestData <- testDataFrame[,topcor]

1.7 PCA Analysis


### PCA 

pc <- principal(noclassData,4*TopVariables,rotate="varimax")   #principal components
pander::pander(t(pc$loadings[1:TopVariables,1:TopVariables]))
  PPE DFA RPDE numPulses numPeriodsPulses
RC1 0.1180 -0.4531 -0.0400 0.05236 0.05316
RC2 0.0183 -0.1656 -0.4659 0.88819 0.88572
RC3 -0.8424 0.0622 0.5331 -0.19679 -0.20273
RC4 -0.0825 0.1254 -0.0145 0.06517 0.06564
RC5 0.1080 -0.0133 0.3107 -0.00808 -0.00887
PCA_Train <- as.data.frame(cbind(predict(pc,noclassData),trainDataFrame[,notcorr]))
colnames(PCA_Train) <- c(colnames(predict(pc,noclassData)),notcorr)

PCA_Predicted <- as.data.frame(cbind(predict(pc,noclassTestData),testDataFrame[,notcorr]))
colnames(PCA_Predicted) <- c(colnames(predict(pc,noclassTestData)),notcorr)

iscontinous <- colnames(PCA_Predicted)[sapply(apply(PCA_Predicted,2,unique),length) > 5] ## Only variables with enough samples
varlistPCA <-  iscontinous;

univarPCA <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              rankingTest = "CStat")

100 : tqwt_medianValue_dec_13


univarPCATest <- uniRankVar(varlistPCA,
              paste(outcome,"~1"),
                outcome,
              PCA_Train,
              testData=PCA_Predicted,
              rankingTest = "CStat")

100 : tqwt_medianValue_dec_13


univarPCA$orderframe$BACC <- (univarPCA$orderframe$Sensitivity + univarPCA$orderframe$Specificity)/2.0
univarPCATest$orderframe$BACC <- (univarPCATest$orderframe$Sensitivity + univarPCATest$orderframe$Specificity)/2.0

1.8 EFA


uls <- fa(noclassData,4*TopVariables,rotate="varimax")  #unweighted least squares is minres 
pander::pander(t(uls$weights[1:TopVariables,1:TopVariables])) 
  PPE DFA RPDE numPulses numPeriodsPulses
MR1 0.1174 -0.4571 -0.0375 0.05178 0.0525
MR2 0.0183 -0.1641 -0.4643 0.88673 0.8843
MR3 -0.8394 0.0636 0.5311 -0.19621 -0.2022
MR4 -0.0821 0.1230 -0.0151 0.06501 0.0654
MR5 0.1061 -0.0180 0.3104 -0.00854 -0.0093
EFA_Train <- as.data.frame(cbind(predict(uls,noclassData),trainDataFrame[,notcorr]))
colnames(EFA_Train) <- c(colnames(predict(uls,noclassData)),notcorr)
EFA_Predicted <- as.data.frame(cbind(predict(uls,noclassTestData),testDataFrame[,notcorr]))
colnames(EFA_Predicted) <- c(colnames(predict(uls,noclassTestData)),notcorr)

iscontinous <- colnames(EFA_Predicted)[sapply(apply(EFA_Predicted,2,unique),length) > 5] ## Only variables with enough 
varlistEFA <-  iscontinous
univarEFA <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              rankingTest = "CStat")

100 : tqwt_medianValue_dec_13


univarEFATest <- uniRankVar(varlistEFA,
              paste(outcome,"~1"),
                outcome,
              EFA_Train,
              testData=EFA_Predicted,
              rankingTest = "CStat")

100 : tqwt_medianValue_dec_13


univarEFA$orderframe$BACC <- (univarEFA$orderframe$Sensitivity + univarEFA$orderframe$Specificity)/2.0
univarEFATest$orderframe$BACC <- (univarEFATest$orderframe$Sensitivity + univarEFATest$orderframe$Specificity)/2.0

1.9 White

WhiteMat = whiteningMatrix(cov(noclassData), method="PCA")
sum(is.na(WhiteMat))

[1] 120888

tokeep <- apply(is.na(WhiteMat),1,sum) == 0
WhiteMat <- WhiteMat[tokeep,]
sum(is.na(WhiteMat))

[1] 0

sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 252

tokeep <- apply(abs(WhiteMat),1,sum) < 1.0e6
WhiteMat <- WhiteMat[tokeep,]
sum(apply(abs(WhiteMat),1,sum) > 1.0e6)

[1] 0


pander::pander(c(ncol=ncol(WhiteMat),nrow=nrow(WhiteMat)))
ncol nrow
584 125

pander::pander(WhiteMat[1:TopVariables,1:TopVariables]) 
  PPE DFA RPDE numPulses numPeriodsPulses
L1 3.12e-05 -2.67e-06 -2.86e-05 4.63e-06 5.02e-06
L2 1.85e-06 8.93e-08 -8.94e-07 1.88e-05 1.91e-05
L3 3.91e-07 3.20e-06 7.69e-06 -1.23e-05 -1.19e-05
L4 6.20e-05 1.32e-06 -3.05e-05 6.09e-05 6.22e-05
L5 -2.59e-05 -3.48e-05 -3.48e-05 7.50e-05 7.63e-05
PCAWhite_Train <- as.data.frame(cbind(tcrossprod(as.matrix(noclassData), WhiteMat),trainDataFrame[,notcorr]))
colnames(PCAWhite_Train) <- c(colnames(tcrossprod(as.matrix(noclassData), WhiteMat)),notcorr)

sum(is.na(PCAWhite_Train))

[1] 0




PCAWhitePredicted <- as.data.frame(cbind(tcrossprod(as.matrix(noclassTestData), WhiteMat),testDataFrame[,notcorr]))
colnames(PCAWhitePredicted) <- c(colnames(tcrossprod(as.matrix(noclassTestData), WhiteMat)),notcorr)

sum(is.na(PCAWhitePredicted))

[1] 0


iscontinous <- colnames(PCAWhitePredicted)[sapply(apply(PCAWhitePredicted,2,unique),length) > 5] ## Only variables with enough 
varlistWhite <-  iscontinous

univarWhite <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              rankingTest = "CStat")

100 : L100 200 : tqwt_medianValue_dec_8



univarWhiteTest <- uniRankVar(varlistWhite,
              paste(outcome,"~1"),
                outcome,
              PCAWhite_Train,
              testData=PCAWhitePredicted,
              rankingTest = "CStat")

100 : L100 200 : tqwt_medianValue_dec_8


univarWhite$orderframe$BACC <- (univarWhite$orderframe$Sensitivity + univarWhite$orderframe$Specificity)/2.0
univarWhiteTest$orderframe$BACC <- (univarWhiteTest$orderframe$Sensitivity + univarWhiteTest$orderframe$Specificity)/2.0

1.10 Correlation Matrices

1.10.1 RAW

par(cex=1.0,cex.main=0.8)
breaks <- c(0:5)/5.0;

cormat <- cor(testDataFrame,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
pander::pander(max(abs(cormat)))

1

pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.517 0.694 0.834 0.954 0.997
pander::pander(c(Raw_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
Raw_fraction
0.0729

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Raw Correlation",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature"
                  )


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Raw Correlation",xlab="Spearman Correlation")

rawDen <- density(cormat,from=-1,to=1)
par(op)

1.10.2 UPSTM Blind

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.6
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.184 0.228 0.275 0.347 0.519
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0

## Test Correlation
cormat <- cor(predTestDe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.943
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.21 0.269 0.332 0.422 0.636
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00148

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM",xlab="Spearman Correlation")

DeDen <- density(cormat,from=-1,to=1)


par(op)

1.10.3 UPSTM Blind/Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.983
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.19 0.241 0.3 0.391 0.659
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.00163

## Test Correlation
cormat <- cor(predTestDeSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.992
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.212 0.271 0.336 0.425 0.629
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.0014

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after IDeA:Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM",xlab="Spearman Correlation")

DeSpearDen <- density(cormat,from=-1,to=1)

par(op)

1.10.4 UPSTM Driven

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframe,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.6
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.185 0.23 0.278 0.349 0.515
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0

## Test Correlation
cormat <- cor(DriDEdataframe,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.889
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.195 0.245 0.301 0.383 0.565
pander::pander(c(IDeA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeA_fraction
0.000584

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after Driven-IDeA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")

#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after Driven-UPSTM",xlab="Spearman Correlation")

DeDrivDen <- density(cormat,from=-1,to=1)
par(op)

1.10.5 UPSTM Spearman

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(DriDEdataframeSpear,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;
#pander::pander(colnames(cormat)[apply(abs(cormat),2,max)>rhoThreshold])

pander::pander(c(Train=max(abs(cormat))))
Train
0.972
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.191 0.242 0.301 0.397 0.662
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.00177

## Test Correlation

cormat <- cor(predTestDriSpear,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.994
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.213 0.273 0.339 0.432 0.667
pander::pander(c(IDeAS_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
IDeAS_fraction
0.00204

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation: Driven/Spearman",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")


#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after UPSTM with Spearman",xlab="Spearman Correlation")

DeDrivSpearDen <- density(cormat,from=-1,to=1)
par(op)

1.10.6 PCA

par(cex=1.0,cex.main=0.8)



## Train Correlation

cormat <- cor(PCA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.84
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.164 0.202 0.243 0.334 0.616
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.00122

## Test Correlation
cormat <- cor(PCA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.936
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.175 0.226 0.302 0.455 0.781
pander::pander(c(PCA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCA_fraction
0.00493

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCA",xlab="Spearman Correlation")

PCADen <- density(cormat,from=-1,to=1)

par(op)

1.10.7 EFA

par(cex=1.0,cex.main=0.8)

## Train Correlation

cormat <- cor(EFA_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.843
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.165 0.202 0.244 0.336 0.627
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.00122

## Test Correlation
cormat <- cor(EFA_Predicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.936
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.175 0.226 0.301 0.455 0.781
pander::pander(c(EFA_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
EFA_fraction
0.0051

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after EFA",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after EFA",xlab="Spearman Correlation")

EFADen <- density(cormat,from=-1,to=1)
par(op)

1.10.8 PCA Whitening



## Train Correlation

cormat <- cor(PCAWhite_Train,method="pearson")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Train=max(abs(cormat))))
Train
0.771
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.141 0.172 0.201 0.237 0.375
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
4.6e-05

## Test Correlation
cormat <- cor(PCAWhitePredicted,method="spearman")
cormat[is.na(cormat)] <- 0
diag(cormat) <- 0;

pander::pander(c(Test=max(abs(cormat))))
Test
0.936
pander::pander(quantile(abs(cormat),probs=c(0.9,0.95,0.975,0.99,0.999)))
90% 95% 97.5% 99% 99.9%
0.176 0.216 0.254 0.311 0.582
pander::pander(c(PCAWhite_fraction=sum(abs(cormat)>rhoThreshold)/ncol(cormat)/ncol(cormat)))
PCAWhite_fraction
0.000896

gplots::heatmap.2(abs(cormat),
                  trace = "none",
                  mar = c(5,5),
                  col=rev(heat.colors(5)),
                  breaks = breaks,
                  main = "Correlation after PCAWhite",
                  cexRow = 0.25,
                  cexCol = 0.25,
                  srtCol=35,
                  srtRow=75,
                  key.title=NA,
                  key.xlab="Spearman Correlation",
                  xlab="Feature", ylab="Feature")



#hist(cormat,freq=FALSE,
#     density=NULL,
#     xlim=c(-1,1),
#     ylim=c(0,4.0),
#     main="Correlation after PCAWhite",xlab="Spearman Correlation")

WhiteDen <- density(cormat,from=-1,to=1)
par(op)

1.11 The Density Plot

par(cex=0.7)
colors=c("red","blue","green","darkblue","darkgreen","purple","orange","darkred");

plot(rawDen,
     xlim=c(-1,1),
     ylim=c(0.001,7.0),
     col=colors[1],
     lty=1,
     lwd=4,
     log="y",
     main="Test: Correlation Distribution",xlab="Spearman Correlation")

lines(DeDen,col=colors[2],lty=2,lwd=4)
lines(DeSpearDen,col=colors[3],lty=3,lwd=4)
lines(DeDrivDen,col=colors[4],lty=4,lwd=2)
lines(DeDrivSpearDen,col=colors[5],lty=5,lwd=2)

lines(PCADen,col=colors[6],lty=6,lwd=1)
lines(EFADen,col=colors[7],lty=7,lwd=1)
lines(WhiteDen,col=colors[8],lty=8,lwd=1)

names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
#colors=c("red","blue","green","blue","green","purple","purple","gray");
lines=c(1,2,3,4,5,6,7,8)
lwds=c(4,4,4,2,2,1,1,1)

legend("topleft",names,col=colors,lty=lines,lwd=lwds,cex=0.50)

par(op)

1.11.1 Differences between train and test ROC AUC

par(op)
par(mfrow=c(1,1),cex=0.7)

AUCResults <- list();
diffAUC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
thenames <- thenames[rawAUC >= aucTHR]
rawAUC <- univar$orderframe[thenames,"ROCAUC"]
rawAUCTest <- univarTest$orderframe[thenames,"ROCAUC"]
AUCResults$RAW <- rawAUCTest
diffAUC$RAW <-  rawAUCTest-rawAUC
plot(rawAUC,rawAUCTest-rawAUC,
     xlab="TRAIN ROC AUC",
     ylab="Test:AUC-Train:AUC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="ROC AUC Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAP >= aucTHR]
IDeAP <- univarDe$orderframe[thenames,"ROCAUC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAP <- IDeAP
AUCResults$IDeAP_T <- IDeAPTest
diffAUC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[IDeAS >= aucTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"ROCAUC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$IDeAS <- IDeAS
AUCResults$IDeAS_T <- IDeASTest
diffAUC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAP >= aucTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"ROCAUC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAP <- DIDeAP
AUCResults$DIDeAP_T <- DIDeAPTest
diffAUC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
thenames <- thenames[DIDeAS >= aucTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"ROCAUC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"ROCAUC"]
AUCResults$DIDeAS <- DIDeAS
AUCResults$DIDeAS_T <- DIDeASTest
diffAUC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[PCA >= aucTHR]
PCA <- univarPCA$orderframe[thenames,"ROCAUC"]
PCATest <- univarPCATest$orderframe[thenames,"ROCAUC"]
AUCResults$PCA <- PCA
AUCResults$PCA_T <- PCATest
diffAUC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]
thenames <- thenames[EFA >= aucTHR]
EFA <- univarEFA$orderframe[thenames,"ROCAUC"]

EFATest <- univarEFATest$orderframe[thenames,"ROCAUC"]
AUCResults$EFA <- EFA
AUCResults$EFA_T <- EFATest
diffAUC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
thenames <- thenames[WPCA >= aucTHR]
WPCA <- univarWhite$orderframe[thenames,"ROCAUC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"ROCAUC"]
AUCResults$WPCA <- WPCA
AUCResults$WPCA_T <- WPCATest
diffAUC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.2 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffAUC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired ROC AUC Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffAUC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffAUC),lapply(diffAUC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.3 Distribution of ROC AUC in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(AUCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(AUCResults,
        ylim=c(0.3,1.0),
        ylab="ROC AUC",
        main="ROC AUC of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(AUCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(AUCResults),lapply(AUCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.4 Differences between train and test Balanced Accuracy

par(op)
par(mfrow=c(1,1),cex=0.7)
BACCTHR <- 0.5
BACCResults <- list();
diffBACC <- list();
thenames <- rownames(univar$orderframe)[(rownames(univar$orderframe) %in% colnames(testDataFrame))]
rawBACC <- univar$orderframe[thenames,"BACC"]
thenames <- thenames[rawBACC >= BACCTHR]
rawBACC <- univar$orderframe[thenames,"BACC"]
rawBACCTest <- univarTest$orderframe[thenames,"BACC"]
BACCResults$RAW <- rawBACCTest
diffBACC$RAW <-  rawBACCTest-rawBACC
plot(rawBACC,rawBACCTest-rawBACC,
     xlab="TRAIN Balanced Acc",
     ylab="Test:BACC-Train:BACC",
     xlim=c(0.5,1.0),
     ylim=c(-0.25,0.25),
     pch=1,
     col=colors[1],
     main="Balanced Acc Difference Between Test and Train")

thenames <- rownames(univarDe$orderframe)[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAP >= BACCTHR]
IDeAP <- univarDe$orderframe[thenames,"BACC"]
IDeAPTest <- univarDeTest$orderframe[thenames,"BACC"]
BACCResults$IDeAP <- IDeAP
BACCResults$IDeAP_T <- IDeAPTest
diffBACC$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

thenames <- rownames(univarDeSpear$orderframe)[!(rownames(univarDeSpear$orderframe) %in% colnames(testDataFrame))]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
thenames <- thenames[IDeAS >= BACCTHR]
IDeAS <- univarDeSpear$orderframe[thenames,"BACC"]
IDeASTest <- univarDeSpearTest$orderframe[thenames,"BACC"]
BACCResults$IDeAS <- IDeAS
BACCResults$IDeAS_T <- IDeASTest
diffBACC$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

thenames <- rownames(univarDeDri$orderframe)[!(rownames(univarDeDri$orderframe) %in% colnames(testDataFrame))]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAP >= BACCTHR]
DIDeAP <- univarDeDri$orderframe[thenames,"BACC"]
DIDeAPTest <- univarDeDriTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAP <- DIDeAP
BACCResults$DIDeAP_T <- DIDeAPTest
diffBACC$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

thenames <- rownames(univarDeDriSpear$orderframe)[!(rownames(univarDeDriSpear$orderframe) %in% colnames(testDataFrame))]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
thenames <- thenames[DIDeAS >= BACCTHR]
DIDeAS <- univarDeDriSpear$orderframe[thenames,"BACC"]
DIDeASTest <- univarDeDriSpearTest$orderframe[thenames,"BACC"]
BACCResults$DIDeAS <- DIDeAS
BACCResults$DIDeAS_T <- DIDeASTest
diffBACC$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

thenames <- rownames(univarPCA$orderframe)[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCA <- univarPCA$orderframe[thenames,"BACC"]
thenames <- thenames[PCA >= BACCTHR]
PCA <- univarPCA$orderframe[thenames,"BACC"]
PCATest <- univarPCATest$orderframe[thenames,"BACC"]
BACCResults$PCA <- PCA
BACCResults$PCA_T <- PCATest
diffBACC$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

thenames <- rownames(univarEFA$orderframe)[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFA <- univarEFA$orderframe[thenames,"BACC"]
thenames <- thenames[EFA >= BACCTHR]
EFA <- univarEFA$orderframe[thenames,"BACC"]

EFATest <- univarEFATest$orderframe[thenames,"BACC"]
BACCResults$EFA <- EFA
BACCResults$EFA_T <- EFATest
diffBACC$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

thenames <- rownames(univarWhite$orderframe)[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
thenames <- thenames[WPCA >= BACCTHR]
WPCA <- univarWhite$orderframe[thenames,"BACC"]
WPCATest <- univarWhiteTest$orderframe[thenames,"BACC"]
BACCResults$WPCA <- WPCA
BACCResults$WPCA_T <- WPCATest
diffBACC$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.5 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffBACC,
        ylim=c(-0.25,0.25),
        ylab="Test-Train",
        main="Test-Train Paired Balanced Acc Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffBACC, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffBACC),lapply(diffBACC,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.6 Distribution of Balanced Acc in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(BACCResults)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(BACCResults,
        ylim=c(0.3,1.0),
        ylab="Balanced Acc",
        main="Balanced Acc of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
abline(h=0.5,col="black")
stripchart(BACCResults, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(BACCResults),lapply(BACCResults,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.7 Differences between train and test IDI

par(op)
par(mfrow=c(1,1),cex=0.7)

testIDI <- list();
diffIDI <- list();
rawIDI <- univar$orderframe$IDI
rawIDITest <- univarTest$orderframe$IDI
testIDI$RAW <- rawIDITest
diffIDI$RAW <-  rawIDITest-rawIDI
plot(rawIDI,rawIDITest-rawIDI,
     xlab="TRAIN Test IDI",
     ylab="Test:IDI-Train:IDI",
     xlim=c(0,0.5),
     ylim=c(-0.2,0.2),
     pch=1,
     col=colors[1],
     main="Predict IDI Difference Between Test and Train")

IDeAP <- univarDe$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
IDeAPTest <-univarDeTest$orderframe$IDI[!(rownames(univarDe$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAP <- IDeAP
testIDI$IDeAP_T <- IDeAPTest
diffIDI$IDeAP <-  IDeAPTest-IDeAP

points(IDeAP,IDeAPTest-IDeAP,pch=2,col=colors[2])

IDeAS <- univarDeSpear$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
IDeASTest <- univarDeSpearTest$orderframe$IDI[!(rownames(univarDeSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$IDeAS <- IDeAS
testIDI$IDeAS_T <- IDeASTest
diffIDI$IDeAS <-  IDeASTest-IDeAS

points(IDeAS,IDeASTest-IDeAS,pch=3,col=colors[3])

DIDeAP <- univarDeDri$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
DIDeAPTest <- univarDeDriTest$orderframe$IDI[!(rownames(univarDeDriTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAP <- DIDeAP
testIDI$DIDeAP_T <- DIDeAPTest
diffIDI$DIDeAP <-  DIDeAPTest-DIDeAP

points(DIDeAP,DIDeAPTest-DIDeAP,pch=4,col=colors[4])

DIDeAS <- univarDeDriSpear$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
DIDeASTest <- univarDeDriSpearTest$orderframe$IDI[!(rownames(univarDeDriSpearTest$orderframe) %in% colnames(testDataFrame))]
testIDI$DIDeAS <- DIDeAS
testIDI$DIDeAS_T <- DIDeASTest
diffIDI$DIDeAS <-  DIDeASTest-DIDeAS

points(DIDeAS,DIDeASTest-DIDeAS,pch=5,col=colors[5])

PCA <- univarPCA$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
PCATest <- univarPCATest$orderframe$IDI[!(rownames(univarPCA$orderframe) %in% colnames(testDataFrame))]
testIDI$PCA <- PCA
testIDI$PCA_T <- PCATest
diffIDI$PCA <-  PCATest-PCA

points(PCA,PCATest-PCA,pch=6,col=colors[6])

EFA <- univarEFA$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
EFATest <- univarEFATest$orderframe$IDI[!(rownames(univarEFA$orderframe) %in% colnames(testDataFrame))]
testIDI$EFA <- EFA
testIDI$EFA_T <- EFATest
diffIDI$EFA <-  EFATest-EFA

points(EFA,EFATest-EFA,pch=7,col=colors[7])

WPCA <- univarWhite$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
WPCATest <- univarWhiteTest$orderframe$IDI[!(rownames(univarWhite$orderframe) %in% colnames(testDataFrame))]
testIDI$WPCA <- WPCA
testIDI$WPCA_T <- WPCATest
diffIDI$WPCA <-  WPCATest-WPCA


points(WPCA,WPCATest-WPCA,pch=8,col=colors[8])


names=c("Raw","IDeA:P","IDeA:S","DIDeA:P","DIDeA:S","PCA","EFA","White:PCA")
pchs=c(1,2,3,4,5,6,7,8)

legend("bottomright",names,col=colors,pch=pchs,cex=0.50)

par(op)

1.11.8 Violin of differences


par(op)
par(mfrow=c(1,1),cex=0.7)

vioplot(diffIDI,
        ylim=c(-0.2,0.2),
        ylab="Test-Train",
        main="Test-Train Paired Predict IDI Difference",
        col=colors,
        cex.axis=0.6,
        las=2
)
stripchart(diffIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(diffIDI),lapply(diffIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.9 Distribution of Predict IDI in latent Variables

par(op)
par(mfrow=c(1,1),cex=0.7)

colors2 <- length(testIDI)
colors2[1] <- colors[1];
colors2[2*(1:(length(colors)-1))] <- colors[2:length(colors)]
colors2[1+2*(1:(length(colors)-1))] <- colors[2:length(colors)]
vioplot(testIDI,
        ylim=c(0.0,0.5),
        ylab="Predict IDI",
        main="Predict IDI of Latent Variables",
        col=colors2,
        cex.axis=0.6,
        las=2
)
stripchart(testIDI, method = "jitter", col = "gray",
           vertical = TRUE, pch = 1, add = TRUE,cex=0.35)
points(1:length(testIDI),lapply(testIDI,mean), pch = 18, col = "yellow", cex = 2.25)

par(op)

1.11.10 The tables


pander::pander(univarTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
std_6th_delta_delta 23.8 0.251 23.5 0.172 0.957 0.140 0.706 0.829
std_6th_delta 24.5 0.245 24.3 0.183 0.780 0.112 0.681 0.820
std_7th_delta_delta 23.7 0.255 23.4 0.201 0.965 0.134 0.728 0.820
std_8th_delta 24.4 0.243 24.1 0.164 0.753 0.136 0.707 0.817
std_8th_delta_delta 23.7 0.240 23.4 0.163 0.863 0.147 0.738 0.815
pander::pander(univarDeTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_locShimmer 1.5596 0.0944 1.469 0.0671 0.838 0.1230 0.691 0.782
La_std_9th_delta 0.0538 0.1553 -0.089 0.1182 0.980 0.1005 0.632 0.779
La_std_3rd_delta -0.1323 0.1151 -0.257 0.1639 0.152 0.1346 0.698 0.766
La_std_4th_delta -1.4084 0.1246 -1.510 0.1118 0.586 0.0885 0.667 0.744
La_tqwt_entropy_log_dec_20 -12.7481 0.1041 -12.688 0.0674 0.640 0.0147 0.587 0.743
pander::pander(univarDeSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
std_9th_delta 24.366 0.2426 24.100 0.1931 9.74e-01 0.1393 0.731 0.803
La_locShimmer 1.560 0.0944 1.469 0.0671 8.38e-01 0.1230 0.691 0.782
La_std_3rd_delta -0.132 0.1151 -0.257 0.1639 1.52e-01 0.1346 0.698 0.766
La_tqwt_entropy_shannon_dec_29 0.316 0.9831 -2.088 10.7182 7.49e-08 0.0467 0.638 0.760
La_std_6th_delta 0.749 0.1292 0.646 0.0936 7.61e-01 0.0772 0.683 0.754
pander::pander(univarDeDriTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
std_6th_delta_delta 23.804 0.251 23.507 0.172 0.9567 0.1397 0.706 0.829
La_std_9th_delta 0.251 0.128 0.120 0.133 0.1993 0.1326 0.773 0.791
La_tqwt_entropy_log_dec_29 0.168 0.278 0.303 0.227 0.0027 0.0541 0.696 0.767
La_std_3rd_delta -0.132 0.115 -0.257 0.164 0.1525 0.1346 0.698 0.766
La_tqwt_entropy_log_dec_20 -43.936 0.114 -43.841 0.101 0.5730 0.1093 0.701 0.758
pander::pander(univarDeDriSpearTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
La_std_9th_delta 0.251 0.1283 0.120 0.1334 0.199 0.1326 0.773 0.791
La_std_3rd_delta -0.132 0.1151 -0.257 0.1639 0.152 0.1346 0.698 0.766
La_apq3Shimmer -0.675 0.0981 -0.590 0.0667 0.935 0.0732 0.628 0.755
La_std_6th_delta 0.749 0.1292 0.646 0.0936 0.761 0.0772 0.683 0.754
tqwt_kurtosisValue_dec_20 28.527 0.2481 28.335 0.1746 0.590 0.1234 0.705 0.751
pander::pander(univarPCATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
RC5 10.9 43.0 -31.912 28.0 0.856205 0.166 0.740 0.806
mean_MFCC_2nd_coef 22.8 16.3 3.623 27.8 0.000469 0.160 0.767 0.743
RC1 -21.9 128.6 64.220 76.6 0.102498 0.127 0.776 0.729
RC2 -18.9 96.1 55.663 92.3 0.639001 0.111 0.683 0.721
mean_delta_log_energy -11.9 17.5 -0.184 20.6 0.002515 0.053 0.701 0.718
pander::pander(univarEFATest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
MR5 10.83 42.87 -31.80 27.90 0.858212 0.166 0.740 0.805
MR13 2.02 8.95 -5.94 9.04 0.692884 0.144 0.728 0.744
mean_MFCC_2nd_coef 22.79 16.29 3.62 27.83 0.000469 0.160 0.767 0.743
MR1 -21.84 128.27 64.14 76.50 0.099106 0.128 0.778 0.729
MR2 -18.94 96.03 55.62 92.23 0.641580 0.111 0.683 0.721
pander::pander(univarWhiteTest$orderframe[1:TopVariables,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP IDI ROCAUC cStatCorr
mean_MFCC_2nd_coef 22.79 16.287 3.623 27.832 4.69e-04 0.1597 0.767 0.743
L13 -2.45 0.956 -3.201 0.927 8.17e-01 0.0374 0.593 0.729
mean_delta_log_energy -11.91 17.465 -0.184 20.646 2.52e-03 0.0530 0.701 0.718
tqwt_medianValue_dec_10 -1.97 16.122 -10.549 13.287 1.63e-05 -0.0216 0.475 0.710
L27 6.30 0.934 7.029 0.999 7.26e-01 0.0866 0.648 0.699

topUni <- univar$orderframe$Name[1:TopVariables]
topDe <- univarDe$orderframe$Name[1:TopVariables]
topDeSpear <- univarDeSpear$orderframe$Name[1:TopVariables]
topDeDri <- univarDeDri$orderframe$Name[1:TopVariables]
topDeDriSpear <- univarDeDriSpear$orderframe$Name[1:TopVariables]
topPCA <- univarPCA$orderframe$Name[1:TopVariables]
topEFA <- univarEFA$orderframe$Name[1:TopVariables]
topPCAWhite <- univarWhite$orderframe$Name[1:TopVariables]

1.11.11 Model of top variables

par(mfrow=c(1,2),cex=0.6)

lmRAW <- glm(paste(outcome,"~."),
             trainDataFrame[,c(outcome,topUni)],
             family="binomial")
prRaw <- predictionStats_binary(cbind(testDataFrame[,outcome],predict(lmRAW,testDataFrame)),"Top Raw",cex=0.75)


lmDe <- glm(paste(outcome,"~."),
            DEdataframe[,c(outcome,topDe)],
            family="binomial")
prDe <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDe,predTestDe)),"Top IDeA:P",cex=0.75)


lmDeSpear <- glm(paste(outcome,"~."),
            DEdataframeSpear[,c(outcome,topDeSpear)],
            family="binomial")
prSpear <- predictionStats_binary(cbind(predTestDeSpear[,outcome],predict(lmDeSpear,predTestDeSpear)),"Top IDeA:S",cex=0.75)


lmDri <- glm(paste(outcome,"~."),
            DriDEdataframe[,c(outcome,topDeDri)],
            family="binomial")
prDri <- predictionStats_binary(cbind(predTestDe[,outcome],predict(lmDri,predTestDri)),"Top DIDeA:P",cex=0.75)


lmDriSpear <- glm(paste(outcome,"~."),
            DriDEdataframeSpear[,c(outcome,topDeDriSpear)],
            family="binomial")
prDriSpear <- predictionStats_binary(cbind(predTestDriSpear[,outcome],predict(lmDriSpear,predTestDriSpear)),"Top DIDeA:S",cex=0.7)



lmPCA <- glm(paste(outcome,"~."),
            PCA_Train[,c(outcome,topPCA)],
            family="binomial")
prPCA <- predictionStats_binary(cbind(PCA_Predicted[,outcome],predict(lmPCA,PCA_Predicted)),"Top PCA",cex=0.75)



lmEFA <- glm(paste(outcome,"~."),
            EFA_Train[,c(outcome,topEFA)],
            family="binomial")
prEFA <- predictionStats_binary(cbind(EFA_Predicted[,outcome],predict(lmEFA,EFA_Predicted)),"Top EFA",cex=0.75)



lmPCAW <- glm(paste(outcome,"~."),
            PCAWhite_Train[,c(outcome,topPCAWhite)],
            family="binomial")
prWPCA <- predictionStats_binary(cbind(PCAWhitePredicted[,outcome],predict(lmPCAW,PCAWhitePredicted)),"Top White:PCA",cex=0.75)

par(op)

1.11.12 The Performance Tables and Plots


par(cex=0.6)

 aucs <- prRaw$aucs
  aucs <- rbind(aucs,prDe$aucs)
  aucs <- rbind(aucs,prSpear$aucs)
  aucs <- rbind(aucs,prDri$aucs)
  aucs <- rbind(aucs,prDriSpear$aucs)
  aucs <- rbind(aucs,prPCA$aucs)
  aucs <- rbind(aucs,prEFA$aucs)
  aucs <- rbind(aucs,prWPCA$aucs)

  
  rownames(aucs) <- c("RAW",
                        "IDeA:P",
                        "IDeA:S",
                        "DIDeA:P",
                        "DIDeA:S",
                        "PCA",
                        "EFA",
                        "WPCA"
                        )
  
  pander::pander(aucs)
  est lower upper
RAW 0.720 0.629 0.811
IDeA:P 0.728 0.626 0.830
IDeA:S 0.777 0.690 0.864
DIDeA:P 0.794 0.711 0.877
DIDeA:S 0.790 0.699 0.882
PCA 0.798 0.712 0.884
EFA 0.796 0.708 0.884
WPCA 0.683 0.567 0.799
  
  bpAUC <- barPlotCiError(as.matrix(aucs),
                          metricname = "ROC AUC",
                          thesets = "Test AUC",
                          themethod = rownames(aucs),
                          main = "ROC AUC",
                          offsets = c(0.5,1),
                          scoreDirection = ">",
                          ho=0.5,
                          args.legend = list(bg = "white",x="bottomleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )


  
 berror <- prRaw$berror
  berror <- rbind(berror,prDe$berror)
  berror <- rbind(berror,prSpear$berror)
  berror <- rbind(berror,prDri$berror)
  berror <- rbind(berror,prDriSpear$berror)
  berror <- rbind(berror,prPCA$berror)
  berror <- rbind(berror,prEFA$berror)
  berror <- rbind(berror,prWPCA$berror)


  rownames(berror) <- rownames(aucs)
  pander::pander(berror)
  50% 2.5% 97.5%
RAW 0.434 0.348 0.513
IDeA:P 0.339 0.241 0.427
IDeA:S 0.354 0.267 0.449
DIDeA:P 0.325 0.234 0.425
DIDeA:S 0.344 0.256 0.435
PCA 0.250 0.164 0.338
EFA 0.259 0.167 0.360
WPCA 0.362 0.269 0.453

  bpBER <- barPlotCiError(as.matrix(berror),
                          metricname = "Balanced Error Rate",
                          thesets = "Test BER",
                          themethod = rownames(aucs),
                          main = "Balanced Error Rate",
                          offsets = c(0.5,1),
                          scoreDirection = "<",
                          ho=0.5,
                          args.legend = list(bg = "white",x="topleft",inset=c(0.0,0),cex=0.5),
                          col = terrain.colors(nrow(aucs))
                          )

  par(op)